library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ ggplot2 3.5.1 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.4 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(survival)
library(ggsurvfit)
library(gtsummary)
library(glue)
library(labelled)
library(rpart.plot)
## Loading required package: rpart
library(rpart)
library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Loading required package: sandwich
##
## Attaching package: 'strucchange'
##
## The following object is masked from 'package:stringr':
##
## boundary
##
##
## Attaching package: 'party'
##
## The following object is masked from 'package:gtsummary':
##
## where
##
## The following object is masked from 'package:dplyr':
##
## where
library(partykit)
## Loading required package: libcoin
##
## Attaching package: 'partykit'
##
## The following objects are masked from 'package:party':
##
## cforest, ctree, ctree_control, edge_simple, mob, mob_control,
## node_barplot, node_bivplot, node_boxplot, node_inner, node_surv,
## node_terminal, varimp
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:survival':
##
## cluster
##
## The following object is masked from 'package:purrr':
##
## lift
load("dat1.RData")
load("dat2.RData")
#mutating factor variables and creating labels for baseline characteristic table
table_data = dat1 |>
mutate(gender = factor(gender, levels = c(0,1), labels = c("Female","Male")),
race = factor(race, levels = c(1,2,3,4), labels = c("White","Asian","Black","Hispanic")),
smoking = factor(smoking, levels = c(0,1,2), labels = c("Never smoked", "Former smoker", "Current smoker")),
diabetes = factor(diabetes, levels = c(0,1), labels = c("No","Yes")),
hypertension = factor(hypertension, levels = c(0,1), labels = c("No", "Yes")))
#creating table labels
var_label(table_data) = list(
age = "Age (in years)",
gender = "Gender",
race = "Race/ethnicity",
smoking = "Smoking status",
height = "Height (in centimeters)",
weight = "Weight (in kilograms)",
bmi = "BMI (body mass index)",
diabetes = "Diabetes",
hypertension = "Hypertension",
SBP = "Systolic Blood Pressure (mmHg)",
LDL = "LDL Cholesterol (mg/dL)",
time = "Time since vaccination (in days)"
)
#outputting table
table = table_data |>
select(age, gender, race, smoking, height, weight, bmi, diabetes, hypertension, SBP, LDL, time) |>
tbl_summary(
missing_text = "(Missing)",
statistic = list(all_continuous() ~ "{median} ± {sd}")
)
print(table)
## <div id="eaasytoupi" style="padding-left:0px;padding-right:0px;padding-top:10px;padding-bottom:10px;overflow-x:auto;overflow-y:auto;width:auto;height:auto;">
## <style>#eaasytoupi table {
## font-family: system-ui, 'Segoe UI', Roboto, Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol', 'Noto Color Emoji';
## -webkit-font-smoothing: antialiased;
## -moz-osx-font-smoothing: grayscale;
## }
##
## #eaasytoupi thead, #eaasytoupi tbody, #eaasytoupi tfoot, #eaasytoupi tr, #eaasytoupi td, #eaasytoupi th {
## border-style: none;
## }
##
## #eaasytoupi p {
## margin: 0;
## padding: 0;
## }
##
## #eaasytoupi .gt_table {
## display: table;
## border-collapse: collapse;
## line-height: normal;
## margin-left: auto;
## margin-right: auto;
## color: #333333;
## font-size: 16px;
## font-weight: normal;
## font-style: normal;
## background-color: #FFFFFF;
## width: auto;
## border-top-style: solid;
## border-top-width: 2px;
## border-top-color: #A8A8A8;
## border-right-style: none;
## border-right-width: 2px;
## border-right-color: #D3D3D3;
## border-bottom-style: solid;
## border-bottom-width: 2px;
## border-bottom-color: #A8A8A8;
## border-left-style: none;
## border-left-width: 2px;
## border-left-color: #D3D3D3;
## }
##
## #eaasytoupi .gt_caption {
## padding-top: 4px;
## padding-bottom: 4px;
## }
##
## #eaasytoupi .gt_title {
## color: #333333;
## font-size: 125%;
## font-weight: initial;
## padding-top: 4px;
## padding-bottom: 4px;
## padding-left: 5px;
## padding-right: 5px;
## border-bottom-color: #FFFFFF;
## border-bottom-width: 0;
## }
##
## #eaasytoupi .gt_subtitle {
## color: #333333;
## font-size: 85%;
## font-weight: initial;
## padding-top: 3px;
## padding-bottom: 5px;
## padding-left: 5px;
## padding-right: 5px;
## border-top-color: #FFFFFF;
## border-top-width: 0;
## }
##
## #eaasytoupi .gt_heading {
## background-color: #FFFFFF;
## text-align: center;
## border-bottom-color: #FFFFFF;
## border-left-style: none;
## border-left-width: 1px;
## border-left-color: #D3D3D3;
## border-right-style: none;
## border-right-width: 1px;
## border-right-color: #D3D3D3;
## }
##
## #eaasytoupi .gt_bottom_border {
## border-bottom-style: solid;
## border-bottom-width: 2px;
## border-bottom-color: #D3D3D3;
## }
##
## #eaasytoupi .gt_col_headings {
## border-top-style: solid;
## border-top-width: 2px;
## border-top-color: #D3D3D3;
## border-bottom-style: solid;
## border-bottom-width: 2px;
## border-bottom-color: #D3D3D3;
## border-left-style: none;
## border-left-width: 1px;
## border-left-color: #D3D3D3;
## border-right-style: none;
## border-right-width: 1px;
## border-right-color: #D3D3D3;
## }
##
## #eaasytoupi .gt_col_heading {
## color: #333333;
## background-color: #FFFFFF;
## font-size: 100%;
## font-weight: normal;
## text-transform: inherit;
## border-left-style: none;
## border-left-width: 1px;
## border-left-color: #D3D3D3;
## border-right-style: none;
## border-right-width: 1px;
## border-right-color: #D3D3D3;
## vertical-align: bottom;
## padding-top: 5px;
## padding-bottom: 6px;
## padding-left: 5px;
## padding-right: 5px;
## overflow-x: hidden;
## }
##
## #eaasytoupi .gt_column_spanner_outer {
## color: #333333;
## background-color: #FFFFFF;
## font-size: 100%;
## font-weight: normal;
## text-transform: inherit;
## padding-top: 0;
## padding-bottom: 0;
## padding-left: 4px;
## padding-right: 4px;
## }
##
## #eaasytoupi .gt_column_spanner_outer:first-child {
## padding-left: 0;
## }
##
## #eaasytoupi .gt_column_spanner_outer:last-child {
## padding-right: 0;
## }
##
## #eaasytoupi .gt_column_spanner {
## border-bottom-style: solid;
## border-bottom-width: 2px;
## border-bottom-color: #D3D3D3;
## vertical-align: bottom;
## padding-top: 5px;
## padding-bottom: 5px;
## overflow-x: hidden;
## display: inline-block;
## width: 100%;
## }
##
## #eaasytoupi .gt_spanner_row {
## border-bottom-style: hidden;
## }
##
## #eaasytoupi .gt_group_heading {
## padding-top: 8px;
## padding-bottom: 8px;
## padding-left: 5px;
## padding-right: 5px;
## color: #333333;
## background-color: #FFFFFF;
## font-size: 100%;
## font-weight: initial;
## text-transform: inherit;
## border-top-style: solid;
## border-top-width: 2px;
## border-top-color: #D3D3D3;
## border-bottom-style: solid;
## border-bottom-width: 2px;
## border-bottom-color: #D3D3D3;
## border-left-style: none;
## border-left-width: 1px;
## border-left-color: #D3D3D3;
## border-right-style: none;
## border-right-width: 1px;
## border-right-color: #D3D3D3;
## vertical-align: middle;
## text-align: left;
## }
##
## #eaasytoupi .gt_empty_group_heading {
## padding: 0.5px;
## color: #333333;
## background-color: #FFFFFF;
## font-size: 100%;
## font-weight: initial;
## border-top-style: solid;
## border-top-width: 2px;
## border-top-color: #D3D3D3;
## border-bottom-style: solid;
## border-bottom-width: 2px;
## border-bottom-color: #D3D3D3;
## vertical-align: middle;
## }
##
## #eaasytoupi .gt_from_md > :first-child {
## margin-top: 0;
## }
##
## #eaasytoupi .gt_from_md > :last-child {
## margin-bottom: 0;
## }
##
## #eaasytoupi .gt_row {
## padding-top: 8px;
## padding-bottom: 8px;
## padding-left: 5px;
## padding-right: 5px;
## margin: 10px;
## border-top-style: solid;
## border-top-width: 1px;
## border-top-color: #D3D3D3;
## border-left-style: none;
## border-left-width: 1px;
## border-left-color: #D3D3D3;
## border-right-style: none;
## border-right-width: 1px;
## border-right-color: #D3D3D3;
## vertical-align: middle;
## overflow-x: hidden;
## }
##
## #eaasytoupi .gt_stub {
## color: #333333;
## background-color: #FFFFFF;
## font-size: 100%;
## font-weight: initial;
## text-transform: inherit;
## border-right-style: solid;
## border-right-width: 2px;
## border-right-color: #D3D3D3;
## padding-left: 5px;
## padding-right: 5px;
## }
##
## #eaasytoupi .gt_stub_row_group {
## color: #333333;
## background-color: #FFFFFF;
## font-size: 100%;
## font-weight: initial;
## text-transform: inherit;
## border-right-style: solid;
## border-right-width: 2px;
## border-right-color: #D3D3D3;
## padding-left: 5px;
## padding-right: 5px;
## vertical-align: top;
## }
##
## #eaasytoupi .gt_row_group_first td {
## border-top-width: 2px;
## }
##
## #eaasytoupi .gt_row_group_first th {
## border-top-width: 2px;
## }
##
## #eaasytoupi .gt_summary_row {
## color: #333333;
## background-color: #FFFFFF;
## text-transform: inherit;
## padding-top: 8px;
## padding-bottom: 8px;
## padding-left: 5px;
## padding-right: 5px;
## }
##
## #eaasytoupi .gt_first_summary_row {
## border-top-style: solid;
## border-top-color: #D3D3D3;
## }
##
## #eaasytoupi .gt_first_summary_row.thick {
## border-top-width: 2px;
## }
##
## #eaasytoupi .gt_last_summary_row {
## padding-top: 8px;
## padding-bottom: 8px;
## padding-left: 5px;
## padding-right: 5px;
## border-bottom-style: solid;
## border-bottom-width: 2px;
## border-bottom-color: #D3D3D3;
## }
##
## #eaasytoupi .gt_grand_summary_row {
## color: #333333;
## background-color: #FFFFFF;
## text-transform: inherit;
## padding-top: 8px;
## padding-bottom: 8px;
## padding-left: 5px;
## padding-right: 5px;
## }
##
## #eaasytoupi .gt_first_grand_summary_row {
## padding-top: 8px;
## padding-bottom: 8px;
## padding-left: 5px;
## padding-right: 5px;
## border-top-style: double;
## border-top-width: 6px;
## border-top-color: #D3D3D3;
## }
##
## #eaasytoupi .gt_last_grand_summary_row_top {
## padding-top: 8px;
## padding-bottom: 8px;
## padding-left: 5px;
## padding-right: 5px;
## border-bottom-style: double;
## border-bottom-width: 6px;
## border-bottom-color: #D3D3D3;
## }
##
## #eaasytoupi .gt_striped {
## background-color: rgba(128, 128, 128, 0.05);
## }
##
## #eaasytoupi .gt_table_body {
## border-top-style: solid;
## border-top-width: 2px;
## border-top-color: #D3D3D3;
## border-bottom-style: solid;
## border-bottom-width: 2px;
## border-bottom-color: #D3D3D3;
## }
##
## #eaasytoupi .gt_footnotes {
## color: #333333;
## background-color: #FFFFFF;
## border-bottom-style: none;
## border-bottom-width: 2px;
## border-bottom-color: #D3D3D3;
## border-left-style: none;
## border-left-width: 2px;
## border-left-color: #D3D3D3;
## border-right-style: none;
## border-right-width: 2px;
## border-right-color: #D3D3D3;
## }
##
## #eaasytoupi .gt_footnote {
## margin: 0px;
## font-size: 90%;
## padding-top: 4px;
## padding-bottom: 4px;
## padding-left: 5px;
## padding-right: 5px;
## }
##
## #eaasytoupi .gt_sourcenotes {
## color: #333333;
## background-color: #FFFFFF;
## border-bottom-style: none;
## border-bottom-width: 2px;
## border-bottom-color: #D3D3D3;
## border-left-style: none;
## border-left-width: 2px;
## border-left-color: #D3D3D3;
## border-right-style: none;
## border-right-width: 2px;
## border-right-color: #D3D3D3;
## }
##
## #eaasytoupi .gt_sourcenote {
## font-size: 90%;
## padding-top: 4px;
## padding-bottom: 4px;
## padding-left: 5px;
## padding-right: 5px;
## }
##
## #eaasytoupi .gt_left {
## text-align: left;
## }
##
## #eaasytoupi .gt_center {
## text-align: center;
## }
##
## #eaasytoupi .gt_right {
## text-align: right;
## font-variant-numeric: tabular-nums;
## }
##
## #eaasytoupi .gt_font_normal {
## font-weight: normal;
## }
##
## #eaasytoupi .gt_font_bold {
## font-weight: bold;
## }
##
## #eaasytoupi .gt_font_italic {
## font-style: italic;
## }
##
## #eaasytoupi .gt_super {
## font-size: 65%;
## }
##
## #eaasytoupi .gt_footnote_marks {
## font-size: 75%;
## vertical-align: 0.4em;
## position: initial;
## }
##
## #eaasytoupi .gt_asterisk {
## font-size: 100%;
## vertical-align: 0;
## }
##
## #eaasytoupi .gt_indent_1 {
## text-indent: 5px;
## }
##
## #eaasytoupi .gt_indent_2 {
## text-indent: 10px;
## }
##
## #eaasytoupi .gt_indent_3 {
## text-indent: 15px;
## }
##
## #eaasytoupi .gt_indent_4 {
## text-indent: 20px;
## }
##
## #eaasytoupi .gt_indent_5 {
## text-indent: 25px;
## }
##
## #eaasytoupi .katex-display {
## display: inline-flex !important;
## margin-bottom: 0.75em !important;
## }
##
## #eaasytoupi div.Reactable > div.rt-table > div.rt-thead > div.rt-tr.rt-tr-group-header > div.rt-th-group:after {
## height: 0px !important;
## }
## </style>
## <table class="gt_table" data-quarto-disable-processing="false" data-quarto-bootstrap="false">
## <thead>
## <tr class="gt_col_headings">
## <th class="gt_col_heading gt_columns_bottom_border gt_left" rowspan="1" colspan="1" scope="col" id="label"><span class='gt_from_md'><strong>Characteristic</strong></span></th>
## <th class="gt_col_heading gt_columns_bottom_border gt_center" rowspan="1" colspan="1" scope="col" id="stat_0"><span class='gt_from_md'><strong>N = 5,000</strong></span><span class="gt_footnote_marks" style="white-space:nowrap;font-style:italic;font-weight:normal;line-height:0;"><sup>1</sup></span></th>
## </tr>
## </thead>
## <tbody class="gt_table_body">
## <tr><td headers="label" class="gt_row gt_left">Age (in years)</td>
## <td headers="stat_0" class="gt_row gt_center">60.0 ± 4.5</td></tr>
## <tr><td headers="label" class="gt_row gt_left">Gender</td>
## <td headers="stat_0" class="gt_row gt_center"><br /></td></tr>
## <tr><td headers="label" class="gt_row gt_left"> Female</td>
## <td headers="stat_0" class="gt_row gt_center">2,573 (51%)</td></tr>
## <tr><td headers="label" class="gt_row gt_left"> Male</td>
## <td headers="stat_0" class="gt_row gt_center">2,427 (49%)</td></tr>
## <tr><td headers="label" class="gt_row gt_left">Race/ethnicity</td>
## <td headers="stat_0" class="gt_row gt_center"><br /></td></tr>
## <tr><td headers="label" class="gt_row gt_left"> White</td>
## <td headers="stat_0" class="gt_row gt_center">3,221 (64%)</td></tr>
## <tr><td headers="label" class="gt_row gt_left"> Asian</td>
## <td headers="stat_0" class="gt_row gt_center">278 (5.6%)</td></tr>
## <tr><td headers="label" class="gt_row gt_left"> Black</td>
## <td headers="stat_0" class="gt_row gt_center">1,036 (21%)</td></tr>
## <tr><td headers="label" class="gt_row gt_left"> Hispanic</td>
## <td headers="stat_0" class="gt_row gt_center">465 (9.3%)</td></tr>
## <tr><td headers="label" class="gt_row gt_left">Smoking status</td>
## <td headers="stat_0" class="gt_row gt_center"><br /></td></tr>
## <tr><td headers="label" class="gt_row gt_left"> Never smoked</td>
## <td headers="stat_0" class="gt_row gt_center">3,010 (60%)</td></tr>
## <tr><td headers="label" class="gt_row gt_left"> Former smoker</td>
## <td headers="stat_0" class="gt_row gt_center">1,504 (30%)</td></tr>
## <tr><td headers="label" class="gt_row gt_left"> Current smoker</td>
## <td headers="stat_0" class="gt_row gt_center">486 (9.7%)</td></tr>
## <tr><td headers="label" class="gt_row gt_left">Height (in centimeters)</td>
## <td headers="stat_0" class="gt_row gt_center">170.1 ± 5.9</td></tr>
## <tr><td headers="label" class="gt_row gt_left">Weight (in kilograms)</td>
## <td headers="stat_0" class="gt_row gt_center">80 ± 7</td></tr>
## <tr><td headers="label" class="gt_row gt_left">BMI (body mass index)</td>
## <td headers="stat_0" class="gt_row gt_center">27.60 ± 2.76</td></tr>
## <tr><td headers="label" class="gt_row gt_left">Diabetes</td>
## <td headers="stat_0" class="gt_row gt_center">772 (15%)</td></tr>
## <tr><td headers="label" class="gt_row gt_left">Hypertension</td>
## <td headers="stat_0" class="gt_row gt_center">2,298 (46%)</td></tr>
## <tr><td headers="label" class="gt_row gt_left">Systolic Blood Pressure (mmHg)</td>
## <td headers="stat_0" class="gt_row gt_center">130 ± 8</td></tr>
## <tr><td headers="label" class="gt_row gt_left">LDL Cholesterol (mg/dL)</td>
## <td headers="stat_0" class="gt_row gt_center">110 ± 20</td></tr>
## <tr><td headers="label" class="gt_row gt_left">Time since vaccination (in days)</td>
## <td headers="stat_0" class="gt_row gt_center">106 ± 43</td></tr>
## </tbody>
##
## <tfoot class="gt_footnotes">
## <tr>
## <td class="gt_footnote" colspan="2"><span class="gt_footnote_marks" style="white-space:nowrap;font-style:italic;font-weight:normal;line-height:0;"><sup>1</sup></span> <span class='gt_from_md'>Median ± SD; n (%)</span></td>
## </tr>
## </tfoot>
## </table>
## </div>
##Distribution of Log Antibody Levels by Race/Ethnicity
age_antibody <- ggplot(table_data, aes(x = age, y = log_antibody)) +
geom_bar(stat = "identity", fill = "skyblue") +
labs(x = "Age", y = "Average Log Antibody Level",
title = "Average Log Antibody Levels by Age") +
theme_minimal()
age_antibody
#Average Log Antibody Levels by Gender"
gender_antibody <- ggplot(table_data, aes(x = factor(gender), y = log_antibody)) +
geom_boxplot() +
labs(x = "Gender", y = "Log Antibody Level") +
theme_minimal()
gender_antibody
#Distribution of Log Antibody Levels by Race/Ethnicity
race_antibody <- ggplot(table_data, aes(x = log_antibody, fill = factor(race))) +
geom_histogram(binwidth = 0.1, position = "identity", alpha = 0.6, color = "black") +
labs(x = "Log Antibody Level", y = "Count", title = "Distribution of Log Antibody Levels by Race/Ethnicity") +
scale_fill_manual(values = c("blue", "orange", "green", "purple"),
labels = c("1" = "White", "2" = "Asian", "3" = "Black", "4" = "Hispanic")) +
theme_minimal() +
theme(legend.title = element_blank())
race_antibody
#Average Log Antibody Levels by Smoking Status
smoking_antibody <- ggplot(table_data, aes(x = factor(smoking), y = log_antibody)) +
geom_bar(stat = "identity", fill = "skyblue") +
labs(x = "Smoking Status", y = "Average Log Antibody Level",
title = "Average Log Antibody Levels by Smoking Status") +
scale_x_discrete(labels = c("0" = "Never Smoked",
"1" = "Former Smoker",
"2" = "Current Smoker")) +
theme_minimal()
smoking_antibody
#Distribution of Log Antibody Levels by BMI
bmi_antibody <- ggplot(table_data, aes_string(x = "bmi", y = "log_antibody")) +
geom_point(color = "darkgreen", alpha = 0.5) +
geom_smooth(method = "loess", span = 0.5, color = "red", se = FALSE) +
theme_bw() +
labs(x = "BMI", y = "Log Antibody Levels")
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
bmi_antibody
## `geom_smooth()` using formula = 'y ~ x'
#Average Log Antibody levels by diabetes status
diabetes_antibody <- ggplot(table_data, aes(x = factor(diabetes), y = log_antibody)) +
geom_boxplot() +
labs(x = "Diabetes", y = "Log Antibody Level") +
theme_minimal()
diabetes_antibody
#Average Log Antibody levels by hypertension status
hypertension_antibody <- ggplot(table_data, aes(x = factor(hypertension), y = log_antibody)) +
geom_boxplot() +
labs(x = "Hypertension", y = "Log Antibody Level") +
theme_minimal()
hypertension_antibody
#Distribution of Log Antibody Levels by SBP
sbp_antibody <- ggplot(table_data, aes_string(x = "SBP", y = "log_antibody")) +
geom_point(color = "darkgreen", alpha = 0.5) +
geom_smooth(method = "loess", span = 0.5, color = "red", se = FALSE) +
theme_bw() +
labs(x = "SBP", y = "Log Antibody Levels")
sbp_antibody
## `geom_smooth()` using formula = 'y ~ x'
#Distribution of Log Antibody Levels by LDL
LDL_antibody <- ggplot(table_data, aes_string(x = "LDL", y = "log_antibody")) +
geom_point(color = "darkgreen", alpha = 0.5) +
geom_smooth(method = "loess", span = 0.5, color = "red", se = FALSE) +
theme_bw() +
labs(x = "LDL", y = "Log Antibody Levels")
LDL_antibody
## `geom_smooth()` using formula = 'y ~ x'
#Distribution of Log Antibody Levels by Time Since Vaccination
time_antibody <- ggplot(table_data, aes_string(x = "time", y = "log_antibody")) +
geom_point(color = "darkgreen", alpha = 0.5) +
geom_smooth(method = "loess", span = 0.5, color = "red", se = FALSE) +
theme_bw() +
labs(x = "Time Since Vaccination", y = "Log Antibody Levels")
time_antibody
## `geom_smooth()` using formula = 'y ~ x'
#age & antibody (p <2e-16 for intercept and age)
b <- lm(log_antibody ~ age, data = table_data)
summary(b)
##
## Call:
## lm(formula = log_antibody ~ age, data = table_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.35812 -0.38225 0.02436 0.41087 1.87772
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.260010 0.111368 101.11 <2e-16 ***
## age -0.019938 0.001852 -10.77 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5897 on 4998 degrees of freedom
## Multiple R-squared: 0.02267, Adjusted R-squared: 0.02247
## F-statistic: 115.9 on 1 and 4998 DF, p-value: < 2.2e-16
#gender & antibody (t = 17.523, df = 4972.7, p-value < 2.2e-16)
t.test(log_antibody ~ gender, data = table_data)
##
## Welch Two Sample t-test
##
## data: log_antibody by gender
## t = 17.523, df = 4972.7, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
## 0.2550640 0.3193275
## sample estimates:
## mean in group Female mean in group Male
## 10.20375 9.91655
#race & antibody (Pr(>F) = 0.367)
anova_race <- aov(log_antibody ~ race, data = table_data)
summary(anova_race)
## Df Sum Sq Mean Sq F value Pr(>F)
## race 3 1.1 0.3754 1.055 0.367
## Residuals 4996 1777.5 0.3558
#smoking & antibody (Pr(>F) = 1.95e-10)
anova_smoke <- aov(log_antibody ~ smoking, data = table_data)
summary(anova_smoke)
## Df Sum Sq Mean Sq F value Pr(>F)
## smoking 2 15.8 7.922 22.46 1.95e-10 ***
## Residuals 4997 1762.8 0.353
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# bmi & antibody (p-value: < 2.2e-16)
bmi_linear <- lm(log_antibody ~ bmi, data = table_data)
summary(bmi_linear)
##
## Call:
## lm(formula = log_antibody ~ bmi, data = table_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.10461 -0.37352 0.02197 0.39745 1.74852
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.41935 0.08307 137.47 <2e-16 ***
## bmi -0.04885 0.00298 -16.39 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5811 on 4998 degrees of freedom
## Multiple R-squared: 0.05102, Adjusted R-squared: 0.05083
## F-statistic: 268.7 on 1 and 4998 DF, p-value: < 2.2e-16
#diabetes & antibody p-value = 0.6828
t.test(log_antibody ~ diabetes, data = table_data)
##
## Welch Two Sample t-test
##
## data: log_antibody by diabetes
## t = -0.40872, df = 1077.9, p-value = 0.6828
## alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
## 95 percent confidence interval:
## -0.05498657 0.03602815
## sample estimates:
## mean in group No mean in group Yes
## 10.06288 10.07236
#hypertension & antibody p-value = 5.491e-05
t.test(log_antibody ~ hypertension, data = table_data)
##
## Welch Two Sample t-test
##
## data: log_antibody by hypertension
## t = 4.0373, df = 4891.7, p-value = 5.491e-05
## alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
## 95 percent confidence interval:
## 0.03505772 0.10124460
## sample estimates:
## mean in group No mean in group Yes
## 10.09566 10.02751
#sbp and antibody p-value: 1.451e-05
sbp_linear <- lm(log_antibody ~ SBP, data = table_data)
summary(sbp_linear)
##
## Call:
## lm(formula = log_antibody ~ SBP, data = table_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.27107 -0.38295 0.02501 0.40767 1.91119
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.657727 0.136973 77.81 < 2e-16 ***
## SBP -0.004568 0.001052 -4.34 1.45e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5954 on 4998 degrees of freedom
## Multiple R-squared: 0.003755, Adjusted R-squared: 0.003556
## F-statistic: 18.84 on 1 and 4998 DF, p-value: 1.451e-05
#LDL and antibody p-value: 0.01143
LDL_linear <- lm(log_antibody ~ LDL, data = table_data)
summary(LDL_linear)
##
## Call:
## lm(formula = log_antibody ~ LDL, data = table_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.27236 -0.38601 0.02812 0.41348 1.85900
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.1807375 0.0467688 217.68 <2e-16 ***
## LDL -0.0010590 0.0004186 -2.53 0.0114 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5962 on 4998 degrees of freedom
## Multiple R-squared: 0.001279, Adjusted R-squared: 0.001079
## F-statistic: 6.402 on 1 and 4998 DF, p-value: 0.01143
#time and antibody p-value: 0.3278
time_linear <- lm(log_antibody ~ time, data = table_data)
summary(time_linear)
##
## Call:
## lm(formula = log_antibody ~ time, data = table_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.3137 -0.3854 0.0251 0.4127 1.8965
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.0850448 0.0227735 442.842 <2e-16 ***
## time -0.0001902 0.0001943 -0.979 0.328
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5965 on 4998 degrees of freedom
## Multiple R-squared: 0.0001916, Adjusted R-squared: -8.417e-06
## F-statistic: 0.9579 on 1 and 4998 DF, p-value: 0.3278
x <- model.matrix(log_antibody ~ ., dat1) [, -1]
y <- dat1$log_antibody
x2 <- model.matrix(log_antibody ~ ., dat2) [, -1]
y2 <- dat2$log_antibody
#cross-validation
ctrl1 = trainControl(method = "repeatedcv",
number = 10,
repeats = 5,
selectionFunction = "best")
#GAM
set.seed(2)
gam.fit <- train(x, y,
method = "gam",
trControl = ctrl1)
## Loading required package: mgcv
## Loading required package: nlme
##
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
##
## collapse
## This is mgcv 1.9-1. For overview type 'help("mgcv-package")'.
gam.fit$bestTune
## select method
## 2 TRUE GCV.Cp
gam.fit$finalModel
##
## Family: gaussian
## Link function: identity
##
## Formula:
## .outcome ~ gender + race2 + race3 + race4 + smoking1 + smoking2 +
## diabetes + hypertension + s(age) + s(SBP) + s(LDL) + s(bmi) +
## s(time) + s(height) + s(weight) + s(id)
##
## Estimated degrees of freedom:
## 0.991 0.000 0.000 4.661 7.846 1.216 0.000
## 0.000 total = 23.71
##
## GCV score: 0.2786709
gam.m1 <- gam(log_antibody ~ time + age + gender + race + smoking + bmi + diabetes + hypertension +
SBP + LDL, data = dat1)
gam.m2 <- gam(log_antibody ~ s(time) + age + gender + race + smoking + bmi + diabetes + hypertension +
SBP + LDL, data = dat1)
anova(gam.m1, gam.m2, test = "F")
## Analysis of Deviance Table
##
## Model 1: log_antibody ~ time + age + gender + race + smoking + bmi + diabetes +
## hypertension + SBP + LDL
## Model 2: log_antibody ~ s(time) + age + gender + race + smoking + bmi +
## diabetes + hypertension + SBP + LDL
## Resid. Df Resid. Dev Df Deviance F Pr(>F)
## 1 4986.0 1520.6
## 2 4978.5 1407.0 7.4964 113.57 53.614 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(gam.m2)
#Summary of the model
summary(gam.fit)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## .outcome ~ gender + race2 + race3 + race4 + smoking1 + smoking2 +
## diabetes + hypertension + s(age) + s(SBP) + s(LDL) + s(bmi) +
## s(time) + s(height) + s(weight) + s(id)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.228204 0.015328 667.284 < 2e-16 ***
## gender -0.297827 0.014933 -19.944 < 2e-16 ***
## race2 -0.002962 0.033010 -0.090 0.928
## race3 -0.010567 0.018838 -0.561 0.575
## race4 -0.037352 0.026175 -1.427 0.154
## smoking1 0.022213 0.016659 1.333 0.182
## smoking2 -0.193179 0.025834 -7.478 8.88e-14 ***
## diabetes 0.014216 0.020639 0.689 0.491
## hypertension -0.007766 0.015995 -0.486 0.627
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(age) 9.910e-01 9 13.737 <2e-16 ***
## s(SBP) 5.214e-07 9 0.000 0.758
## s(LDL) 5.607e-07 9 0.000 0.634
## s(bmi) 4.661e+00 9 42.117 <2e-16 ***
## s(time) 7.846e+00 9 44.897 <2e-16 ***
## s(height) 1.216e+00 9 0.277 0.119
## s(weight) 1.746e-06 9 0.000 0.612
## s(id) 5.058e-07 9 0.000 0.731
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.22 Deviance explained = 22.4%
## GCV = 0.27867 Scale est. = 0.27735 n = 5000
#MARS
set.seed(2)
mars_grid <- expand.grid(degree = 1:3,
nprune = 2:24)
mars.fit <- train(x, y,
method = "earth",
tuneGrid = mars_grid,
trControl = ctrl1)
## Loading required package: earth
## Loading required package: Formula
## Loading required package: plotmo
## Loading required package: plotrix
## Loading required package: TeachingDemos
#pcr using caret
set.seed(2)
pcr_fit = train(x,y,
method = "pcr",
tuneGrid = data.frame(ncomp=1:5),
trControl = ctrl1,
preProcess = c("center","scale"))
#pls using caret
pls_fit = train(x,y,
method="pls",
tunegrid = data.frame(ncomp=1:5),
trControl = ctrl1,
preProcess = c("center","scale"))
#enet using caret
enet_fit = train(log_antibody ~.,
data = dat1,
method = "glmnet",
tuneGrid = expand.grid(alpha=seq(0,1, length = 21),
lambda = exp(seq(6,0,length = 100))),
trControl = ctrl1)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
#lasso using caret
set.seed(2)
lasso.fit = train(log_antibody ~.,
data = dat1,
method = "glmnet",
tuneGrid = expand.grid(alpha=1,
lambda = exp(seq(6,0,length=100))),
trControl = ctrl1)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
#lm using caret
lm.fit = train(log_antibody ~.,
data = dat1,
method = "lm",
trControl = ctrl1)
#ridge using caret
ridge.fit = train(log_antibody ~.,
data = dat1,
method = "glmnet",
tuneGrid = expand.grid(alpha=1,
lambda = exp(seq(6,0,length=100))),
trControl = ctrl1)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
#comparing models based on resampling results
resamp = resamples(list(lasso = lasso.fit,
lm = lm.fit,
ridge = ridge.fit))
bwplot(resamp, metric = "RMSE")
#comparing models based on resampling results
resamp = resamples(list(pcr = pcr_fit,
pls = pls_fit,
enet = enet_fit,
gam = gam.fit,
mars = mars.fit,
lasso = lasso.fit,
lm = lm.fit,
ridge = ridge.fit))
bwplot(resamp, metric = "RMSE")
Based on the resampling results, the MARS model is the “best” fit as it
has the lowest RMSE.
#creating a piece wise linear model using multivairiate adaptive regression splines (MARS)
ggplot(mars.fit)
mars.fit$bestTune
## nprune degree
## 8 9 1
coef(mars.fit$finalModel)
## (Intercept) h(27.8-bmi) h(time-57) h(57-time) gender h(age-59)
## 10.847446930 -0.061997354 -0.002254182 -0.033529326 -0.296290451 -0.022957648
## h(59-age) smoking2 h(bmi-23.7)
## 0.016138468 -0.205126851 -0.084380175
#understanding the relationship between the coefficients from the final model and lpsa
p1 = pdp::partial(mars.fit, pred.var =c("time"), grid.resolution=10) |> autoplot()
p2 = pdp::partial(mars.fit, pred.var =c("smoking2"), grid.resolution=10) |> autoplot()
p3 = pdp::partial(mars.fit, pred.var =c("gender"), grid.resolution=10) |> autoplot()
p4 = pdp::partial(mars.fit, pred.var = c("age","bmi"), grid.resolution = 10) |>
pdp::plotPartial(levelplot = FALSE, zlab="yhat", drape=TRUE, screen = list(z=20, x=-60))
gridExtra::grid.arrange(p1, p2, p3, p4, ncol=2, nrow=2)
#evaluating final model
predy2_mars2 = predict(mars.fit, newdata=x2)
mean((y2 - predy2_mars2)^2)
## [1] 0.2838458
The MSE for the final model when using the test data is 0.2838458.